In [ ]:
 
In [1]:
#imports
import datetime
import requests
import warnings

import matplotlib
import matplotlib.dates as mdates
import seaborn as sns
import plotly.offline as py
import plotly_express as px

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from statsmodels.tsa.arima_model import ARIMA
from fbprophet import Prophet
from fbprophet.plot import plot_plotly, add_changepoints_to_plot
from itertools import cycle, islice


from IPython.display import Image
warnings.filterwarnings('ignore')
%matplotlib inline



import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

import folium
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import bar_chart_race as bcr
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
import seaborn as sns

%matplotlib inline

import math
import random
from datetime import timedelta

import warnings
warnings.filterwarnings('ignore')


# Colour palette shared by the plots in this notebook.
# `cnf` was '#39e46' (five hex digits), which is not a valid hex colour;
# corrected to six digits. NOTE(review): '#393e46' is the presumed intended
# value — confirm against the palette this was copied from.
cnf = '#393e46'
dth = '#ff2e63'  # used for death-related series
rec = '#21bf73'  # used for recovered series
act = '#fe9801'  # used for active / confirmed series
C:\Users\Himanshu\Anaconda3\lib\site-packages\dask\config.py:168: YAMLLoadWarning:

calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.

C:\Users\Himanshu\Anaconda3\lib\site-packages\dask\dataframe\utils.py:13: FutureWarning:

pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.

C:\Users\Himanshu\Anaconda3\lib\site-packages\distributed\config.py:20: YAMLLoadWarning:

calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.

C:\Users\Himanshu\Anaconda3\lib\site-packages\statsmodels\compat\pandas.py:49: FutureWarning:

The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version

In [7]:
 
In [4]:
 
In [5]:
 
In [ ]:
# Pie chart of the gender column: missing values plus the two most frequent values.
# NOTE(review): `individual_details` is not defined in any cell visible here —
# confirm it is loaded before this cell runs.
gender_counts = individual_details['gender'].value_counts().tolist()

labels = ['Missing', 'Male', 'Female']
# value_counts() is ordered by frequency, so 'Male' / 'Female' are attached to the
# most and second-most frequent values — presumably that matches the data; verify.
sizes = [
    individual_details['gender'].isnull().sum(),
    gender_counts[0],
    gender_counts[1],
]

colors = ['#ffcc99', '#66b3ff', '#ff9999']
explode = (0, 0.1, 0)

plt.figure(figsize=(15, 10))
plt.title('Percentage of Gender', fontsize=20)
plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
plt.axis('equal')
plt.tight_layout()
In [2]:
#DATA PREPARATION
In [3]:
import plotly as py
py.offline.init_notebook_mode(connected=True)
In [4]:
import os
In [5]:
# Remove any stale local copy of the dataset directory.
# shutil.rmtree is portable (no dependency on a POSIX `rm` binary) and, unlike
# os.system, it raises when the path is missing, so the message below is
# actually reachable. Other failures (e.g. permissions) are left to surface.
import shutil

try:
    shutil.rmtree("Covid-19-Preprocessed-Dataset")
except FileNotFoundError:
    print("file does not exist")
In [6]:
# Load the pre-processed CSV files. The three files read in the first
# statement have their 'Date' column parsed to datetimes; countrywise.csv is
# read without date parsing.
df, country_daywise, daywise = (
    pd.read_csv(csv_path, parse_dates=['Date'])
    for csv_path in ('covid_19_data_cleaned.csv', 'country_daywise.csv', 'daywise.csv')
)
countrywise = pd.read_csv('countrywise.csv')
In [7]:
df.head()
Out[7]:
Date Province/State Country Lat Long Confirmed Recovered Deaths Active
0 2020-01-22 NaN Afghanistan 33.93911 67.709953 0 0 0 0
1 2020-01-23 NaN Afghanistan 33.93911 67.709953 0 0 0 0
2 2020-01-24 NaN Afghanistan 33.93911 67.709953 0 0 0 0
3 2020-01-25 NaN Afghanistan 33.93911 67.709953 0 0 0 0
4 2020-01-26 NaN Afghanistan 33.93911 67.709953 0 0 0 0
In [8]:
df['Province/State']=df['Province/State'].fillna("")
df.head()
Out[8]:
Date Province/State Country Lat Long Confirmed Recovered Deaths Active
0 2020-01-22 Afghanistan 33.93911 67.709953 0 0 0 0
1 2020-01-23 Afghanistan 33.93911 67.709953 0 0 0 0
2 2020-01-24 Afghanistan 33.93911 67.709953 0 0 0 0
3 2020-01-25 Afghanistan 33.93911 67.709953 0 0 0 0
4 2020-01-26 Afghanistan 33.93911 67.709953 0 0 0 0
In [49]:
# Encode the two categorical columns as integer IDs.
# Provinces get their own encoder: refitting the shared `le` on Country right
# afterwards would discard the province classes and make ProvinceID impossible
# to decode. `le` still ends up fitted on Country, exactly as before.
province_le = LabelEncoder()
df['ProvinceID'] = province_le.fit_transform(df['Province/State'])
df['CountryID'] = le.fit_transform(df['Country'])
df.head()
Out[49]:
Date Province/State Country Lat Long Confirmed Recovered Deaths Active ProvinceID CountryID
0 2020-01-22 Afghanistan 33.93911 67.709953 0 0 0 0 0 0
1 2020-01-23 Afghanistan 33.93911 67.709953 0 0 0 0 0 0
2 2020-01-24 Afghanistan 33.93911 67.709953 0 0 0 0 0 0
3 2020-01-25 Afghanistan 33.93911 67.709953 0 0 0 0 0 0
4 2020-01-26 Afghanistan 33.93911 67.709953 0 0 0 0 0 0
In [52]:
# Correlation heatmap over the numeric columns.
# Numeric dtypes are selected explicitly so the cell does not depend on
# DataFrame.corr() silently dropping the text/date columns, which current
# pandas no longer does.
corr = df.select_dtypes(include='number').corr()
sns.heatmap(corr, annot=True)
Out[52]:
<matplotlib.axes._subplots.AxesSubplot at 0x1cad5b3a128>
In [53]:
#There is no strong correlation between any of the variables except for Confirmed and Deaths variables (0.76)
In [54]:
fig = plt.figure(figsize=(10, 10))

# 'Confirmed' is a running total per location, so summing it over every date
# counts early cases many times over. Use only the most recent date and add up
# the rows (provinces) of each country.
latest_snapshot = df[df['Date'] == df['Date'].max()]
conf_per_country = (
    latest_snapshot.groupby('Country')['Confirmed']
    .sum()
    .sort_values(ascending=False)
)
conf_sum = conf_per_country.sum()


def absolute_value(val):
    """Format a pie-wedge value (supplied through `autopct`) rounded to 2 decimals."""
    return np.round(val, 2)


conf_per_country.plot(
    kind="pie",
    title='Percentage of confirmed cases per country',
    autopct=absolute_value,
)

plt.show()
In [57]:
# Top five countries by confirmed cases.
# A per-country max() picks the largest single row, which for a country that is
# reported province by province is only its biggest province. Summing the rows
# of the latest date gives the country total instead.
latest_snapshot = df[df['Date'] == df['Date'].max()]
group_cases = (
    latest_snapshot.groupby('Country')[['Confirmed', 'Recovered', 'Deaths']]
    .sum()
    .sort_values('Confirmed', ascending=False)
    .head()
    .reset_index()
)
group_cases
Out[57]:
Country Confirmed Recovered Deaths
0 US 5197118 1753760 166026
1 Brazil 3164785 2506228 104201
2 India 2396637 1695982 47033
3 Russia 900745 708900 15231
4 South Africa 568919 432029 11010
In [58]:
#Among the 5 countries with the most confirmed cases, Russia is doing best, followed by India, if we consider both the recovery rate and the death rate
In [59]:
# Recovery and death ratios as fractions of confirmed cases, rounded to 2 decimals.
recovery_ratio = group_cases['Recovered'].div(group_cases['Confirmed'])
death_ratio = group_cases['Deaths'].div(group_cases['Confirmed'])

group_cases['Recovery Rate'] = recovery_ratio.round(2)
group_cases['Death Rate'] = death_ratio.round(2)

group_cases = group_cases.sort_values(by='Confirmed', ascending=False)
group_cases.style.background_gradient(cmap='Greens')
Out[59]:
Country Confirmed Recovered Deaths Recovery Rate Death Rate
0 US 5197118 1753760 166026 0.34 0.03
1 Brazil 3164785 2506228 104201 0.79 0.03
2 India 2396637 1695982 47033 0.71 0.02
3 Russia 900745 708900 15231 0.79 0.02
4 South Africa 568919 432029 11010 0.76 0.02
In [9]:
country_daywise.head()
Out[9]:
Date Country Confirmed Deaths Recovered Active New Cases New Deaths New Recovered
0 2020-01-22 Afghanistan 0 0 0 0 0 0 0
1 2020-01-22 Albania 0 0 0 0 0 0 0
2 2020-01-22 Algeria 0 0 0 0 0 0 0
3 2020-01-22 Andorra 0 0 0 0 0 0 0
4 2020-01-22 Angola 0 0 0 0 0 0 0
In [10]:
countrywise.head()
Out[10]:
Country Confirmed Deaths Recovered Active New Cases Deaths / 100 Cases Recovered / 100 Cases Deaths / 100 Recovered Population Cases / Million People Confirmed last week 1 week change 1 week % increase
0 Afghanistan 37054 1312 25960 9782 0 3.54 70.06 5.05 38928341 952.0 36829 516 1.40
1 Albania 6411 199 3342 2870 136 3.10 52.13 5.95 2877800 2228.0 5889 928 15.76
2 Algeria 35160 1302 24506 9352 467 3.70 69.70 5.31 43851043 802.0 33055 3644 11.02
3 Andorra 955 52 839 64 0 5.45 87.85 6.20 77265 12360.0 939 38 4.05
4 Angola 1672 75 567 1030 100 4.49 33.91 13.23 32866268 51.0 1395 367 26.31
In [11]:
daywise.head()
Out[11]:
Date Confirmed Deaths Recovered Active New Cases Deaths / 100 Cases Recovered / 100 Cases Deaths / 100 Recovered No. of Countries
0 2020-01-22 555 17 28 510 0 3.06 5.05 60.71 6
1 2020-01-23 654 18 30 606 99 2.75 4.59 60.00 8
2 2020-01-24 941 26 36 879 287 2.76 3.83 72.22 9
3 2020-01-25 1434 42 39 1353 493 2.93 2.72 107.69 11
4 2020-01-26 2118 56 52 2010 684 2.64 2.46 107.69 13
In [12]:
# Worldwide totals per date for each case category.
# The column is selected before aggregating so only that column is summed,
# rather than summing every column of the frame (coordinates and text
# included) and discarding all but one.
confirmed = df.groupby('Date')['Confirmed'].sum().reset_index()
recovered = df.groupby('Date')['Recovered'].sum().reset_index()
deaths = df.groupby('Date')['Deaths'].sum().reset_index()
deaths.head()
Out[12]:
Date Deaths
0 2020-01-22 17
1 2020-01-23 18
2 2020-01-24 26
3 2020-01-25 42
4 2020-01-26 56
In [13]:
df.isnull().sum()
Out[13]:
Date              0
Province/State    0
Country           0
Lat               0
Long              0
Confirmed         0
Recovered         0
Deaths            0
Active            0
dtype: int64
In [14]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55488 entries, 0 to 55487
Data columns (total 9 columns):
Date              55488 non-null datetime64[ns]
Province/State    55488 non-null object
Country           55488 non-null object
Lat               55488 non-null float64
Long              55488 non-null float64
Confirmed         55488 non-null int64
Recovered         55488 non-null int64
Deaths            55488 non-null int64
Active            55488 non-null int64
dtypes: datetime64[ns](1), float64(2), int64(4), object(2)
memory usage: 3.8+ MB
In [15]:
df.query('Country=="US"')
Out[15]:
Date Province/State Country Lat Long Confirmed Recovered Deaths Active
49368 2020-01-22 US 40.0 -100.0 1 0 0 1
49369 2020-01-23 US 40.0 -100.0 1 0 0 1
49370 2020-01-24 US 40.0 -100.0 2 0 0 2
49371 2020-01-25 US 40.0 -100.0 2 0 0 2
49372 2020-01-26 US 40.0 -100.0 5 0 0 5
49373 2020-01-27 US 40.0 -100.0 5 0 0 5
49374 2020-01-28 US 40.0 -100.0 5 0 0 5
49375 2020-01-29 US 40.0 -100.0 5 0 0 5
49376 2020-01-30 US 40.0 -100.0 5 0 0 5
49377 2020-01-31 US 40.0 -100.0 7 0 0 7
49378 2020-02-01 US 40.0 -100.0 8 0 0 8
49379 2020-02-02 US 40.0 -100.0 8 0 0 8
49380 2020-02-03 US 40.0 -100.0 11 0 0 11
49381 2020-02-04 US 40.0 -100.0 11 0 0 11
49382 2020-02-05 US 40.0 -100.0 11 0 0 11
49383 2020-02-06 US 40.0 -100.0 11 0 0 11
49384 2020-02-07 US 40.0 -100.0 11 0 0 11
49385 2020-02-08 US 40.0 -100.0 11 0 0 11
49386 2020-02-09 US 40.0 -100.0 11 3 0 8
49387 2020-02-10 US 40.0 -100.0 11 3 0 8
49388 2020-02-11 US 40.0 -100.0 12 3 0 9
49389 2020-02-12 US 40.0 -100.0 12 3 0 9
49390 2020-02-13 US 40.0 -100.0 13 3 0 10
49391 2020-02-14 US 40.0 -100.0 13 3 0 10
49392 2020-02-15 US 40.0 -100.0 13 3 0 10
49393 2020-02-16 US 40.0 -100.0 13 3 0 10
49394 2020-02-17 US 40.0 -100.0 13 3 0 10
49395 2020-02-18 US 40.0 -100.0 13 3 0 10
49396 2020-02-19 US 40.0 -100.0 13 3 0 10
49397 2020-02-20 US 40.0 -100.0 13 3 0 10
... ... ... ... ... ... ... ... ... ...
49542 2020-07-14 US 40.0 -100.0 3431574 1049098 137045 2245431
49543 2020-07-15 US 40.0 -100.0 3498902 1075882 138013 2285007
49544 2020-07-16 US 40.0 -100.0 3576157 1090645 138966 2346546
49545 2020-07-17 US 40.0 -100.0 3647715 1107204 139886 2400625
49546 2020-07-18 US 40.0 -100.0 3711413 1122720 140756 2447937
49547 2020-07-19 US 40.0 -100.0 3773260 1131121 141201 2500938
49548 2020-07-20 US 40.0 -100.0 3834677 1160087 141715 2532875
49549 2020-07-21 US 40.0 -100.0 3899211 1182018 142824 2574369
49550 2020-07-22 US 40.0 -100.0 3970121 1210849 144035 2615237
49551 2020-07-23 US 40.0 -100.0 4038816 1233269 145156 2660391
49552 2020-07-24 US 40.0 -100.0 4112531 1261624 146279 2704628
49553 2020-07-25 US 40.0 -100.0 4178970 1279414 147180 2752376
49554 2020-07-26 US 40.0 -100.0 4233923 1297863 147657 2788403
49555 2020-07-27 US 40.0 -100.0 4290337 1325804 148782 2815751
49556 2020-07-28 US 40.0 -100.0 4356206 1355363 150150 2850693
49557 2020-07-29 US 40.0 -100.0 4426982 1389425 151586 2885971
49558 2020-07-30 US 40.0 -100.0 4495015 1414155 152802 2928058
49559 2020-07-31 US 40.0 -100.0 4562107 1438160 154048 2969899
49560 2020-08-01 US 40.0 -100.0 4620592 1461885 155159 3003548
49561 2020-08-02 US 40.0 -100.0 4668172 1468689 155565 3043918
49562 2020-08-03 US 40.0 -100.0 4713540 1513446 156104 3043990
49563 2020-08-04 US 40.0 -100.0 4771080 1528979 157482 3084619
49564 2020-08-05 US 40.0 -100.0 4823890 1577851 158854 3087185
49565 2020-08-06 US 40.0 -100.0 4883582 1598624 160104 3124854
49566 2020-08-07 US 40.0 -100.0 4941755 1623870 161347 3156538
49567 2020-08-08 US 40.0 -100.0 4997929 1643118 162423 3192388
49568 2020-08-09 US 40.0 -100.0 5044864 1656864 162938 3225062
49569 2020-08-10 US 40.0 -100.0 5094400 1670755 163463 3260182
49570 2020-08-11 US 40.0 -100.0 5141208 1714960 164527 3261721
49571 2020-08-12 US 40.0 -100.0 5197118 1753760 166026 3277332

204 rows × 9 columns

In [ ]:
 

WORLDWIDE TOTAL CONFIRMED,RECOVERED AND DEATHS

In [16]:
confirmed.tail()
Out[16]:
Date Confirmed
199 2020-08-08 19637506
200 2020-08-09 19861683
201 2020-08-10 20089624
202 2020-08-11 20344188
203 2020-08-12 20620847
In [17]:
recovered.tail()
Out[17]:
Date Recovered
199 2020-08-08 11939109
200 2020-08-09 12115825
201 2020-08-10 12280520
202 2020-08-11 12585473
203 2020-08-12 12826815
In [18]:
deaths.tail()
Out[18]:
Date Deaths
199 2020-08-08 726781
200 2020-08-09 731326
201 2020-08-10 736191
202 2020-08-11 742700
203 2020-08-12 749358
In [19]:
# Worldwide totals over time: one line per case category, all plotted against
# the dates of the `confirmed` frame.
fig = go.Figure()
trace_specs = (
    (confirmed['Confirmed'], 'Confirmed', "Orange"),
    (recovered['Recovered'], 'Recovered', "Green"),
    (deaths['Deaths'], 'Deaths', "Red"),
)
for values, label, colour in trace_specs:
    fig.add_trace(
        go.Scatter(
            x=confirmed['Date'],
            y=values,
            mode='lines+markers',
            name=label,
            line=dict(color=colour, width=4),
        )
    )
fig.update_layout(
    title='Worldwide COVID19 Cases',
    xaxis_tickfont_size=14,
    yaxis=dict(title='Number of cases'),
)
fig.show()
In [20]:
#CASES DENSITY
In [21]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55488 entries, 0 to 55487
Data columns (total 9 columns):
Date              55488 non-null datetime64[ns]
Province/State    55488 non-null object
Country           55488 non-null object
Lat               55488 non-null float64
Long              55488 non-null float64
Confirmed         55488 non-null int64
Recovered         55488 non-null int64
Deaths            55488 non-null int64
Active            55488 non-null int64
dtypes: datetime64[ns](1), float64(2), int64(4), object(2)
memory usage: 3.8+ MB
In [22]:
# Cast the Date column to text.
# NOTE(review): this overwrites df['Date'] in place, so every cell executed
# after this one sees strings instead of datetimes, and re-running earlier
# date-based cells will behave differently — consider a separate column.
df['Date']=df['Date'].astype(str)
In [23]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55488 entries, 0 to 55487
Data columns (total 9 columns):
Date              55488 non-null object
Province/State    55488 non-null object
Country           55488 non-null object
Lat               55488 non-null float64
Long              55488 non-null float64
Confirmed         55488 non-null int64
Recovered         55488 non-null int64
Deaths            55488 non-null int64
Active            55488 non-null int64
dtypes: float64(2), int64(4), object(3)
memory usage: 3.8+ MB
In [24]:
df.head()
Out[24]:
Date Province/State Country Lat Long Confirmed Recovered Deaths Active
0 2020-01-22 Afghanistan 33.93911 67.709953 0 0 0 0
1 2020-01-23 Afghanistan 33.93911 67.709953 0 0 0 0
2 2020-01-24 Afghanistan 33.93911 67.709953 0 0 0 0
3 2020-01-25 Afghanistan 33.93911 67.709953 0 0 0 0
4 2020-01-26 Afghanistan 33.93911 67.709953 0 0 0 0
In [25]:
# Animated density map with one frame per date.
# z='Confirmed' weights every location by its case count. Without a z column
# each row contributes equally, so the heat map only shows where rows exist
# and effectively does not change from frame to frame.
fig = px.density_mapbox(
    df,
    lat='Lat',
    lon='Long',
    z='Confirmed',
    hover_name='Country',
    hover_data=['Confirmed', 'Recovered', 'Deaths'],
    animation_frame='Date',
    color_continuous_scale='Portland',
    radius=7,
    zoom=0,
    height=700,
)
fig.update_layout(title='Worldwide COVID-19 Cases with Time laps')
fig.update_layout(mapbox_style='open-street-map', mapbox_center_lon=0)
fig.show()

Cases over time and area plot

In [26]:
# Treemap of the latest worldwide Active / Deaths / Recovered totals.
# Columns are selected with a list: indexing a groupby with a bare tuple of
# names is rejected by current pandas.
temp = df.groupby('Date')[['Confirmed', 'Deaths', 'Recovered', 'Active']].sum().reset_index()
temp = temp[temp['Date'] == temp['Date'].max()].reset_index(drop=True)
tm = temp.melt(id_vars='Date', value_vars=['Active', 'Deaths', 'Recovered'])

# Colours are mapped by category name. The previous positional list
# [act, rec, dth] was in a different order from the Active / Deaths / Recovered
# categories, so the assignment depended on plotly's internal ordering.
fig = px.treemap(
    tm,
    path=['variable'],
    values='value',
    height=250,
    width=800,
    color='variable',
    color_discrete_map={'Active': act, 'Deaths': dth, 'Recovered': rec},
)

fig.data[0].textinfo = 'label+text+value'
fig.show()
In [27]:
# Stacked area chart of Recovered / Deaths / Active over time.
# Columns are selected with a list: indexing a groupby with a bare tuple of
# names is rejected by current pandas.
temp = df.groupby('Date')[['Recovered', 'Deaths', 'Active']].sum().reset_index()
temp = temp.melt(
    id_vars='Date',
    value_vars=['Recovered', 'Deaths', 'Active'],
    var_name='Case',
    value_name='Count',
)

fig = px.area(
    temp,
    x='Date',
    y='Count',
    color='Case',
    height=600,
    title='Cases over time',
    color_discrete_sequence=[rec, dth, act],
)
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()

WORLDWIDE CASES ON MAPS

In [28]:
# World map with one circle per location for the most recent date.
temp = df[df['Date'] == df['Date'].max()]
m = folium.Map(location=[0, 0], tiles='cartodbpositron', min_zoom=1, max_zoom=4, zoom_start=1)

# Iterate the rows once instead of calling temp.iloc[i] for every field.
for _row_label, row in temp.iterrows():
    folium.Circle(
        location=[row['Lat'], row['Long']],
        color='crimson',
        # `fill` is an on/off switch; the colour itself belongs in `fill_color`.
        fill=True,
        fill_color='crimson',
        tooltip='<li><bold> Country:' + str(row['Country']) +
                '<li><bold> Province:' + str(row['Province/State']) +
                '<li><bold> Confirmed:' + str(row['Confirmed']) +
                '<li><bold> Deaths:' + str(row['Deaths']),
        # Square root keeps the circles of the largest outbreaks from dwarfing the rest.
        radius=int(row['Confirmed']) ** 0.5,
    ).add_to(m)
m
Out[28]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [29]:
# Animated world choropleth of confirmed cases, one frame per day.
# The frame label is the date formatted as YYYY-MM-DD text.
frame_labels = country_daywise['Date'].dt.strftime('%Y-%m-%d')
fig = px.choropleth(
    country_daywise,
    locations='Country',
    locationmode='country names',
    color=country_daywise['Confirmed'],
    hover_name='Country',
    animation_frame=frame_labels,
    title='Cases over time',
    color_continuous_scale=px.colors.sequential.Inferno,
)
fig.update(layout_coloraxis_showscale=True)
fig.show()

Deaths and Recoveries per 100 Cases

In [30]:
# Side-by-side daily bars: confirmed cases (left) and deaths (right).
fig_c = px.bar(daywise, x='Date', y='Confirmed', color_discrete_sequence=[act])
fig_d = px.bar(daywise, x='Date', y='Deaths', color_discrete_sequence=[dth])

fig = make_subplots(
    rows=1,
    cols=2,
    shared_xaxes=False,
    horizontal_spacing=0.1,
    subplot_titles=('Confirmed Cases', 'Death Cases'),
)

# Each express figure holds a single trace; move it into its subplot column.
for column_no, panel in enumerate((fig_c, fig_d), start=1):
    fig.add_trace(panel['data'][0], row=1, col=column_no)

fig.update_layout(height=500)
fig.show()
In [31]:
daywise.columns
Out[31]:
Index(['Date', 'Confirmed', 'Deaths', 'Recovered', 'Active', 'New Cases',
       'Deaths / 100 Cases', 'Recovered / 100 Cases', 'Deaths / 100 Recovered',
       'No. of Countries'],
      dtype='object')
In [32]:
# Three ratio series from `daywise`, drawn as line charts in a single row.
fig1 = px.line(daywise, x='Date', y='Deaths / 100 Cases', color_discrete_sequence=[dth])
fig2 = px.line(daywise, x='Date', y='Recovered / 100 Cases', color_discrete_sequence=[rec])
fig3 = px.line(daywise, x='Date', y='Deaths / 100 Recovered', color_discrete_sequence=['aqua'])

fig = make_subplots(
    rows=1,
    cols=3,
    shared_xaxes=False,
    subplot_titles=('Deaths / 100 Cases', 'Recovered / 100 Cases', 'Deaths / 100 Recovered'),
)

# Each express figure holds a single trace; move it into its subplot column.
for column_no, panel in enumerate((fig1, fig2, fig3), start=1):
    fig.add_trace(panel['data'][0], row=1, col=column_no)

fig.update_layout(height=400)
fig.show()

Confirmed and Death Cases with ColorMap

In [33]:
# Two world maps coloured on a log scale: confirmed cases and deaths.
# log(0) is -inf, so rows with a zero count are dropped before taking the log.
# The deaths map already did this; the confirmed map now does the same.
confirmed_positive = countrywise[countrywise['Confirmed'] > 0]
fig_c = px.choropleth(
    confirmed_positive,
    locations='Country',
    locationmode='country names',
    color=np.log(confirmed_positive['Confirmed']),
    hover_name='Country',
    hover_data=['Confirmed'],
)

temp = countrywise[countrywise['Deaths'] > 0]
fig_d = px.choropleth(
    temp,
    locations='Country',
    locationmode='country names',
    color=np.log(temp['Deaths']),
    hover_name='Country',
    hover_data=['Deaths'],
)

fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=['Confirmed', 'Deaths'],
    specs=[[{'type': 'choropleth'}, {'type': 'choropleth'}]],
)

fig.add_trace(fig_c['data'][0], row=1, col=1)
fig.add_trace(fig_d['data'][0], row=1, col=2)

fig.update(layout_coloraxis_showscale=False)
fig.update_layout(height=1000, width=1000)

fig.show()

New Cases and Number of Countries

In [34]:
# Daily new cases (left) and number of countries reporting (right).
# The left panel is titled "No. of new Cases per day", so it plots the
# 'New Cases' column rather than the cumulative 'Confirmed' column.
fig_c = px.bar(daywise, x='Date', y='New Cases', color_discrete_sequence=[act])
fig_d = px.bar(daywise, x='Date', y='No. of Countries', color_discrete_sequence=[dth])

fig = make_subplots(
    rows=1,
    cols=2,
    shared_xaxes=False,
    horizontal_spacing=0.1,
    subplot_titles=('No. of new Cases per day', 'No. of Countries'),
)

fig.add_trace(fig_c['data'][0], row=1, col=1)
fig.add_trace(fig_d['data'][0], row=1, col=2)

fig.update_layout(height=400)
fig.show()

Top 15 Countries Case Analysis

In [35]:
top = 15


def _top_bar(frame, column, colour):
    """Horizontal bar chart of the `top` rows of `frame` with the largest `column` values."""
    return px.bar(
        frame.sort_values(column).tail(top),
        x=column,
        y='Country',
        text=column,
        orientation='h',
        color_discrete_sequence=[colour],
    )


# Absolute counts.
fig_c = _top_bar(countrywise, 'Confirmed', act)
fig_d = _top_bar(countrywise, 'Deaths', dth)
fig_a = _top_bar(countrywise, 'Active', '#434343')
fig_r = _top_bar(countrywise, 'Recovered', rec)

# Ratios per 100 cases.
fig_dc = _top_bar(countrywise, 'Deaths / 100 Cases', '#f84351')
fig_rc = _top_bar(countrywise, 'Recovered / 100 Cases', '#a45398')

# New cases, and cases per million restricted to populations above one million.
fig_nc = _top_bar(countrywise, 'New Cases', '#f04341')
temp = countrywise[countrywise['Population'] > 1000000]
fig_p = _top_bar(temp, 'Cases / Million People', '#b40398')

# Weekly change; the percentage view is restricted to more than 100 confirmed cases.
fig_wc = _top_bar(countrywise, '1 week change', '#c04041')
temp = countrywise[countrywise['Confirmed'] > 100]
fig_wi = _top_bar(temp, '1 week % increase', '#b00398')

fig = make_subplots(
    rows=5,
    cols=2,
    shared_xaxes=False,
    horizontal_spacing=0.2,
    vertical_spacing=.05,
    subplot_titles=('Confirmed Cases','Deaths Reported','Recovered Cases','Active Cases','Deaths / 100 Cases','Recovered / 100 Cases','New Cases','Cases / Million People','1 week change','1 week % increase'),
)

# Fill the 5x2 grid row by row, in the same order as the subplot titles.
panels = [fig_c, fig_d, fig_r, fig_a, fig_dc, fig_rc, fig_nc, fig_p, fig_wc, fig_wi]
for slot, panel in enumerate(panels):
    fig.add_trace(panel['data'][0], row=slot // 2 + 1, col=slot % 2 + 1)

fig.update_layout(height=4000)
fig.show()

Comparison of COVID-19 with other EPIDEMICS

In [36]:
#Wikipedia Source

# One row per epidemic: (name, start year, end year, confirmed, deaths).
# The COVID-19 figures are the column totals of `countrywise`; the other rows
# are fixed values.
epidemic_rows = [
    ('COVID-19', 2019, 2020, countrywise['Confirmed'].sum(), countrywise['Deaths'].sum()),
    ('SARS', 2002, 2004, 8422, 813),
    ('EBOLA', 2013, 2016, 28646, 11323),
    ('MERS', 2012, 2020, 2519, 866),
    ('H1N1', 2009, 2010, 6724149, 19654),
]
epidemics = pd.DataFrame(
    epidemic_rows,
    columns=['epidemic', 'start_year', 'end_year', 'confirmed', 'deaths'],
)

# Mortality as a percentage of confirmed cases, 2 decimals.
mortality_pct = epidemics['deaths'] / epidemics['confirmed'] * 100
epidemics['mortality'] = mortality_pct.round(2)
epidemics.head()
Out[36]:
epidemic start_year end_year confirmed deaths mortality
0 COVID-19 2019 2020 19861683 731326 3.68
1 SARS 2002 2004 8422 813 9.65
2 EBOLA 2013 2016 28646 11323 39.53
3 MERS 2012 2020 2519 866 34.38
4 H1N1 2009 2010 6724149 19654 0.29
In [37]:
# Long format: one row per (epidemic, measure), drawn as one bar facet per measure.
temp = epidemics.melt(
    id_vars='epidemic',
    value_vars=['confirmed', 'deaths', 'mortality'],
    var_name='Case',
    value_name='Value',
)
fig = px.bar(
    temp,
    x='epidemic',
    y='Value',
    color='epidemic',
    text='Value',
    facet_col='Case',
    color_discrete_sequence=px.colors.qualitative.Bold,
)

fig.update_traces(textposition='outside')

fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_yaxes(showticklabels=False)
# Clear `matches` on the second and third y-axes so each facet scales on its own.
for facet_axis in (fig.layout.yaxis2, fig.layout.yaxis3):
    facet_axis.update(matches=None)
fig.show()
In [38]:
data_corona=pd.read_csv("total_cases.csv")
data_corona.head()
Out[38]:
date World Afghanistan Albania Algeria Andorra Angola Anguilla Antigua and Barbuda Argentina ... United States Virgin Islands Uruguay Uzbekistan Vatican Venezuela Vietnam Western Sahara Yemen Zambia Zimbabwe
0 12/31/2019 27 0 0 0 0 0 0 0 0 ... 0.0 0 0.0 0 0.0 0 0 0 0 0
1 1/1/2020 27 0 0 0 0 0 0 0 0 ... 0.0 0 0.0 0 0.0 0 0 0 0 0
2 1/2/2020 27 0 0 0 0 0 0 0 0 ... 0.0 0 0.0 0 0.0 0 0 0 0 0
3 1/3/2020 44 0 0 0 0 0 0 0 0 ... 0.0 0 0.0 0 0.0 0 0 0 0 0
4 1/4/2020 44 0 0 0 0 0 0 0 0 ... 0.0 0 0.0 0 0.0 0 0 0 0 0

5 rows × 212 columns

In [39]:
# Countries to include in the bar-chart race, plus the date column.
# 'Pakistan' was listed twice, which produced a duplicated column in the subset.
cols = ['date', 'Italy', 'Spain', 'Australia', 'Brazil', 'India', 'Colombia', 'Pakistan',
        'Mexico', 'Peru', 'United States', 'Russia', 'South Africa', 'Chile', 'Iran',
        'Argentina', 'United Kingdom', 'Saudi Arabia', 'Bangladesh',
        'Turkey', 'France', 'Germany']
# set_index returns a new frame; using inplace=True on a column slice of
# data_corona triggers pandas' SettingWithCopyWarning.
Subsetdf = data_corona[cols].set_index("date")
In [40]:
bcr.bar_chart_race(df=Subsetdf,filename=None,figsize=(5,4.5),title='COVID-19 Cases by Country')
Out[40]:
In [ ]:
 
In [ ]:
 

COVID-19 in INDIA

In [41]:
data_corona=pd.read_csv("india_daywise.csv")
data_corona.head()
Out[41]:
date Andaman and Nicobar Islands Andhra Pradesh Arunachal Pradesh Assam Bihar Chandigarh Chhattisgarh Dadar Nagar Haveli Delhi ... Puducherry Punjab Rajasthan Sikkim Tamil Nadu Telangana Tripura Uttar Pradesh Uttarakhand West Bengal
0 30/01/2020 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 31/01/2020 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 1/2/2020 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 2/2/2020 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 3/2/2020 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 36 columns

In [42]:
# States / union territories to include in the bar-chart race, plus the date column.
cols = ['date', 'Andaman and Nicobar Islands', 'Andhra Pradesh', 'Arunachal Pradesh', 'Assam',
        'Bihar', 'Chandigarh', 'Chhattisgarh', 'Dadar Nagar Haveli',
        'Delhi', 'Goa', 'Gujarat', 'Haryana', 'Himachal Pradesh', 'Jammu and Kashmir',
        'Jharkhand', 'Karnataka', 'Kerala', 'Ladakh', 'Madhya Pradesh',
        'Maharashtra', 'Manipur', 'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha', 'Puducherry',
        'Punjab', 'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana', 'Tripura',
        'Uttar Pradesh', 'Uttarakhand', 'West Bengal']

# set_index returns a new frame; using inplace=True on a column slice of
# data_corona triggers pandas' SettingWithCopyWarning.
Subsetdf = data_corona[cols].set_index("date")
Subsetdf.tail(10)
Out[42]:
Andaman and Nicobar Islands Andhra Pradesh Arunachal Pradesh Assam Bihar Chandigarh Chhattisgarh Dadar Nagar Haveli Delhi Goa ... Puducherry Punjab Rajasthan Sikkim Tamil Nadu Telangana Tripura Uttar Pradesh Uttarakhand West Bengal
date
14/08/20 2037 264142 2512 71795 94193 1842 13937 1752 149460 10494 ... 6680 27936 57414 931 320355 90259 6767 140775 11302 107323
15/08/20 2186 273085 2607 74501 98008 1928 14481 1797 150652 10970 ... 6995 29013 58692 1080 326245 91361 6934 145287 11615 110358
16/08/20 2306 281817 2658 75558 101551 2009 14987 1843 151928 11339 ... 7354 30041 59979 1148 332105 92255 7061 150061 11940 113432
17/08/20 2399 289829 2701 76875 103844 2102 15471 1878 152580 11639 ... 7732 31206 61296 1167 338055 93937 7204 154418 12175 116498
18/08/20 2445 296609 2741 79667 106307 2216 16025 1908 153367 11994 ... 8029 32696 62630 1187 343945 95700 7409 158216 12493 119578
19/08/20 2529 306261 2875 82201 109498 2305 16833 1951 154741 12333 ... 8396 34400 63977 1207 349654 97424 7645 162434 12961 122753
20/08/20 2604 316003 2950 84317 112437 2396 17485 1995 156139 12675 ... 8750 36084 65289 1232 355449 99391 7835 167510 13225 125922
21/08/20 2680 325396 3066 86052 114941 2515 18501 2030 157354 13099 ... 9292 37824 66619 1290 361435 101865 8091 172334 13636 129119
22/08/20 2747 334940 3126 87908 117413 2631 19510 2082 158604 13484 ... 9594 39327 67954 1336 367430 104249 8371 177239 14083 132364
23/08/20 2808 345216 3223 89468 119529 2776 20078 2120 160016 13790 ... 10112 40643 69264 1381 373410 57142 8702 182453 14566 135596

10 rows × 35 columns

In [43]:
bcr.bar_chart_race(df=Subsetdf,filename=None,figsize=(5,4.5),title='COVID-19 Cases Statewise in INDIA')
Out[43]:
In [44]:
dp=df.query('Country=="India"')
dp.head()
Out[44]:
Date Province/State Country Lat Long Confirmed Recovered Deaths Active
29172 2020-01-22 India 20.593684 78.96288 0 0 0 0
29173 2020-01-23 India 20.593684 78.96288 0 0 0 0
29174 2020-01-24 India 20.593684 78.96288 0 0 0 0
29175 2020-01-25 India 20.593684 78.96288 0 0 0 0
29176 2020-01-26 India 20.593684 78.96288 0 0 0 0
In [45]:
# `dp` is a filtered selection of another frame; take an explicit copy so the
# column assignment below writes to `dp` itself and does not raise
# SettingWithCopyWarning.
dp = dp.copy()
dp['Country'] = dp['Country'].fillna("")

# Totals per date. The column is selected before aggregating so only that
# column is summed. (Two bare expressions that were neither displayed nor
# assigned have been removed.)
confirmed = dp.groupby('Date')['Confirmed'].sum().reset_index()
recovered = dp.groupby('Date')['Recovered'].sum().reset_index()
deaths = dp.groupby('Date')['Deaths'].sum().reset_index()
In [46]:
# India totals over time: one line per case category, all plotted against the
# dates of the `confirmed` frame.
fig = go.Figure()
trace_specs = (
    (confirmed['Confirmed'], 'Confirmed', "Orange"),
    (recovered['Recovered'], 'Recovered', "Green"),
    (deaths['Deaths'], 'Deaths', "Red"),
)
for values, label, colour in trace_specs:
    fig.add_trace(
        go.Scatter(
            x=confirmed['Date'],
            y=values,
            mode='lines+markers',
            name=label,
            line=dict(color=colour, width=4),
        )
    )
fig.update_layout(
    title='India COVID19 Cases',
    xaxis_tickfont_size=14,
    yaxis=dict(title='Number of cases'),
)
fig.show()
In [47]:
# Treemap of the latest Active / Deaths / Recovered totals in `dp`.
# Columns are selected with a list: indexing a groupby with a bare tuple of
# names is rejected by current pandas.
temp = dp.groupby('Date')[['Confirmed', 'Deaths', 'Recovered', 'Active']].sum().reset_index()
temp = temp[temp['Date'] == temp['Date'].max()].reset_index(drop=True)
tm = temp.melt(id_vars='Date', value_vars=['Active', 'Deaths', 'Recovered'])

# Colours are mapped by category name. The previous positional list
# [act, rec, dth] was in a different order from the Active / Deaths / Recovered
# categories, so the assignment depended on plotly's internal ordering.
fig = px.treemap(
    tm,
    path=['variable'],
    values='value',
    height=250,
    width=800,
    color='variable',
    color_discrete_map={'Active': act, 'Deaths': dth, 'Recovered': rec},
)

fig.data[0].textinfo = 'label+text+value'
fig.show()

Age Group Analysis

In [8]:
age_details = pd.read_csv('AgeGroupDetails.csv')
india_covid_19 = pd.read_csv('covid_19_india.csv')

# One wedge per age group, each pulled out from the centre by the same offset.
labels = age_details['AgeGroup'].tolist()
sizes = age_details['TotalCases'].tolist()
explode = [0.05] * len(labels)

plt.figure(figsize=(15, 10))
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=9, explode=explode)

# A white disc drawn over the centre turns the pie into a donut.
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

plt.title('India - Age Group wise Distribution', fontsize=20)
plt.axis('equal')
plt.tight_layout()
In [11]:
#The age groups under 40 account for the most cases, which runs against the common expectation that elderly people are more at risk of being affected. Only about 17% of the cases are people over 60.
In [3]:
train_data = pd.read_csv("train.csv")#index_col=0
display(train_data.head())
test_data = pd.read_csv("test.csv")#index_col=0
display(test_data.head())
Id Province_State Country_Region Date ConfirmedCases Fatalities
0 1 NaN Afghanistan 2020-01-22 0.0 0.0
1 2 NaN Afghanistan 2020-01-23 0.0 0.0
2 3 NaN Afghanistan 2020-01-24 0.0 0.0
3 4 NaN Afghanistan 2020-01-25 0.0 0.0
4 5 NaN Afghanistan 2020-01-26 0.0 0.0
ForecastId Province_State Country_Region Date
0 1 NaN Afghanistan 2020-04-02
1 2 NaN Afghanistan 2020-04-03
2 3 NaN Afghanistan 2020-04-04
3 4 NaN Afghanistan 2020-04-05
4 5 NaN Afghanistan 2020-04-06
In [4]:
# Totals of confirmed cases and fatalities per date, then the peak of each.
# The aggregation is named by string: passing the np.sum callable is
# deprecated in current pandas and yields the same result.
sum_df = pd.pivot_table(
    train_data,
    values=['ConfirmedCases', 'Fatalities'],
    index=['Date'],
    aggfunc='sum',
)
display(sum_df.max())
ConfirmedCases    4540926.0
Fatalities         307611.0
dtype: float64

Let's create some new features:

-Daily Confirmed cases

-Daily Fatalities

-Growth factor (ratio of a day's new cases to the previous day's new cases)

-Mortality rate (ratio of fatalities to the confirmed cases)

In [6]:
# Daily features derived from the cumulative counts.
#
# The frame stacks one time series per (Country_Region, Province_State), so
# day-over-day differences must be taken inside each series. A plain shift(1)
# over the whole frame subtracts the last row of the previous location from the
# first row of the next one.
# Province_State is NaN for some rows and groupby drops NaN keys, so a combined
# text key with the NaN filled is used for grouping.
# NOTE(review): assumes rows are ordered by date within each location — confirm.
location_key = (
    train_data['Country_Region'].astype(str)
    + '|'
    + train_data['Province_State'].fillna('').astype(str)
)

train_data['NewConfirmedCases'] = (
    train_data.groupby(location_key, sort=False)['ConfirmedCases'].diff().fillna(0.0)
)
train_data['NewFatalities'] = (
    train_data.groupby(location_key, sort=False)['Fatalities'].diff().fillna(0.0)
)

# 0/0 (no cases yet) gives NaN, which is reported as a rate of 0.
train_data['MortalityRate'] = (
    train_data['Fatalities'] / train_data['ConfirmedCases']
).fillna(0.0)

# Ratio of today's new cases to the previous day's, within the same location.
# Division by zero (inf) and undefined ratios (NaN) are reported as 0.
previous_new_cases = (
    train_data.groupby(location_key, sort=False)['NewConfirmedCases'].shift(1)
)
train_data['GrowthRate'] = (
    (train_data['NewConfirmedCases'] / previous_new_cases)
    .replace([-np.inf, np.inf], 0.0)
    .fillna(0.0)
)
display(train_data.head())
Id Province_State Country_Region Date ConfirmedCases Fatalities NewConfirmedCases NewFatalities MortalityRate GrowthRate
0 1 NaN Afghanistan 2020-01-22 0.0 0.0 0.0 0.0 0.0 0.0
1 2 NaN Afghanistan 2020-01-23 0.0 0.0 0.0 0.0 0.0 0.0
2 3 NaN Afghanistan 2020-01-24 0.0 0.0 0.0 0.0 0.0 0.0
3 4 NaN Afghanistan 2020-01-25 0.0 0.0 0.0 0.0 0.0 0.0
4 5 NaN Afghanistan 2020-01-26 0.0 0.0 0.0 0.0 0.0 0.0
In [7]:
def getColumnInfo(df):
    """Summarise the location and date coverage of a data frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Province_State``, ``Country_Region`` and ``Date`` columns.
        ``Date`` must hold values whose ordering is chronological
        (e.g. ``YYYY-MM-DD`` strings or datetimes).

    Returns
    -------
    tuple
        ``(n_province, n_country, n_days, start_date, end_date)``: the number
        of distinct non-null provinces, countries and dates, followed by the
        smallest and the largest ``Date`` value.
    """
    n_province = df['Province_State'].nunique()
    n_country = df['Country_Region'].nunique()
    date_col = df['Date']
    n_days = date_col.nunique()
    # min/max instead of unique()[0] / unique()[-1]: unique() keeps the order of
    # appearance, which is only the chronological order if the rows are sorted.
    start_date = date_col.min()
    end_date = date_col.max()
    return n_province, n_country, n_days, start_date, end_date

# Row counts of the two frames.
n_train = len(train_data)
n_test = len(test_data)

# Location / date coverage of each frame.
(n_prov_train, n_count_train, n_train_days,
 start_date_train, end_date_train) = getColumnInfo(train_data)
(n_prov_test, n_count_test, n_test_days,
 start_date_test, end_date_test) = getColumnInfo(test_data)

print('<==Train data==> \n # of Province_State: ' + str(n_prov_train),
      ', # of Country_Region:' + str(n_count_train),
      ', Time Period: ' + str(start_date_train) + ' to ' + str(end_date_train),
      '==> days:', str(n_train_days))

# Countries that have at least one row with a province/state filled in.
has_province = train_data['Province_State'].notna()
print("\n Countries with Province/State information:  ",
      train_data.loc[has_province, 'Country_Region'].unique())

print('\n <==Test  data==> \n # of Province_State: ' + str(n_prov_test),
      ', # of Country_Region:' + str(n_count_test),
      ', Time Period: ' + start_date_test + ' to ' + end_date_test,
      '==> days:', n_test_days)

# Test rows dated after the cut-off; test days up to and including the cut-off
# are counted as overlapping the training period.
# NOTE(review): the cut-off '2020-04-14' is hard-coded - confirm it still marks
# the end of the training data being used.
df_test = test_data[test_data['Date'] > '2020-04-14']
overlap_days = n_test_days - df_test['Date'].nunique()
total_days = n_train_days + n_test_days - overlap_days
print('\n overlap days with training data: ', overlap_days, ', total days: ', total_days)
<==Train data==> 
 # of Province_State: 133 , # of Country_Region:184 , Time Period: 2020-01-22 to 2020-05-15 ==> days: 115

 Countries with Province/State information:   ['Australia' 'Canada' 'China' 'Denmark' 'France' 'Netherlands' 'US'
 'United Kingdom']

 <==Test  data==> 
 # of Province_State: 133 , # of Country_Region:184 , Time Period: 2020-04-02 to 2020-05-14 ==> days: 43

 overlap days with training data:  13 , total days:  145

We need to produce predictions for 43 days, 13 of which overlap with the training data and will be used to test our forecast model. Let's look at the data records with entries greater than zero.

In [9]:
# Relative frequency of every distinct cumulative value (kept for inspection).
prob_confirm_check_train = train_data.ConfirmedCases.value_counts(normalize=True)
prob_fatal_check_train = train_data.Fatalities.value_counts(normalize=True)

# Count the rows that report at least one case / fatality directly.
# value_counts()[1:] only gives this number when zero is the most frequent
# value and the slice is taken by position.
# NOTE(review): integer slices on a float-valued index have not been
# positional in every pandas release - confirm for the installed version.
n_confirm_train = int((train_data.ConfirmedCases > 0).sum())
n_fatal_train = int((train_data.Fatalities > 0).sum())

# Percentages use the same numerator and denominator that are printed.
pct_confirm_train = 100.0 * n_confirm_train / n_train
pct_fatal_train = 100.0 * n_fatal_train / n_train

print('Percentage of confirmed case records = {0:<2.0f}/{1:<2.0f} = {2:<2.1f}%'.format(n_confirm_train, n_train, pct_confirm_train))
print('Percentage of fatality records = {0:<2.0f}/{1:<2.0f} = {2:<2.1f}%'.format(n_fatal_train, n_train, pct_fatal_train))
Percentage of confirmed case records = 23521/35995 = 65.3%
Percentage of fatality records = 15706/35995 = 43.6%

Trend by Country/Region for the maximum cases

In [10]:
# One row per (Date, Country_Region): counts are summed over the country's
# rows, GrowthRate keeps the value of the last row in the group.
country_aggregations = {'ConfirmedCases': 'sum', 'Fatalities': 'sum', 'GrowthRate': 'last'}
train_data_by_country = (
    train_data
    .groupby(['Date', 'Country_Region'], as_index=False)
    .agg(country_aggregations)
)

# Snapshot on the latest training date, restricted to countries above 100
# confirmed cases and ordered from most to fewest cases.
max_train_date = train_data['Date'].max()
train_data_by_country_confirm = (
    train_data_by_country
    .query('(Date == @max_train_date) & (ConfirmedCases > 100)')
    .sort_values('ConfirmedCases', ascending=False)
    .set_index('Country_Region')
)

(
    train_data_by_country_confirm.style
    .background_gradient(cmap='Reds')
    .format({'ConfirmedCases': "{:.0f}", 'GrowthRate': "{:.2f}"})
)
Out[10]:
Date ConfirmedCases Fatalities GrowthRate
Country_Region
US 2020-05-15 1442653 87525.000000 1.15
Russia 2020-05-15 262843 2418.000000 1.06
United Kingdom 2020-05-15 238004 34078.000000 1.03
Spain 2020-05-15 230183 27459.000000 0.76
Italy 2020-05-15 223885 31610.000000 0.80
Brazil 2020-05-15 220291 14962.000000 1.31
France 2020-05-15 179630 27532.000000 0.83
Germany 2020-05-15 175233 7897.000000 1.99
Turkey 2020-05-15 146457 4055.000000 1.04
Iran 2020-05-15 116635 6902.000000 1.16
India 2020-05-15 85784 2753.000000 0.96
Peru 2020-05-15 84495 2392.000000 0.91
China 2020-05-15 84038 4637.000000 0.00
Canada 2020-05-15 75945 5678.000000 0.00
Belgium 2020-05-15 54644 8959.000000 1.16
Saudi Arabia 2020-05-15 49176 292.000000 1.13
Mexico 2020-05-15 45032 4767.000000 1.01
Netherlands 2020-05-15 43880 5662.000000 0.74
Chile 2020-05-15 39542 394.000000 0.94
Pakistan 2020-05-15 38799 834.000000 6.14
Ecuador 2020-05-15 31467 2594.000000 60.31
Switzerland 2020-05-15 30514 1878.000000 1.02
Qatar 2020-05-15 29425 14.000000 0.67
Sweden 2020-05-15 29207 3646.000000 0.93
Portugal 2020-05-15 28583 1190.000000 1.41
Belarus 2020-05-15 27730 156.000000 1.01
Singapore 2020-05-15 26891 21.000000 1.05
Ireland 2020-05-15 23956 1518.000000 0.30
United Arab Emirates 2020-05-15 21831 210.000000 1.07
Bangladesh 2020-05-15 20065 298.000000 1.15
Poland 2020-05-15 18016 907.000000 0.98
Ukraine 2020-05-15 17330 476.000000 1.14
Israel 2020-05-15 16589 266.000000 0.32
Indonesia 2020-05-15 16496 1076.000000 0.86
Romania 2020-05-15 16437 1070.000000 0.78
Japan 2020-05-15 16203 713.000000 1.17
Austria 2020-05-15 16109 628.000000 0.84
Colombia 2020-05-15 14216 546.000000 0.89
South Africa 2020-05-15 13524 247.000000 1.18
Kuwait 2020-05-15 12860 96.000000 0.93
Philippines 2020-05-15 12091 806.000000 0.83
Dominican Republic 2020-05-15 11739 424.000000 3.38
Egypt 2020-05-15 11228 592.000000 1.00
Korea, South 2020-05-15 11037 262.000000 0.70
Denmark 2020-05-15 10989 537.000000 1.70
Serbia 2020-05-15 10438 225.000000 0.81
Panama 2020-05-15 9268 266.000000 0.86
Czechia 2020-05-15 8406 295.000000 0.67
Norway 2020-05-15 8219 232.000000 1.10
Argentina 2020-05-15 7479 356.000000 1.35
Australia 2020-05-15 7035 98.000000 0.00
Malaysia 2020-05-15 6855 112.000000 0.90
Morocco 2020-05-15 6652 190.000000 0.47
Algeria 2020-05-15 6629 536.000000 0.99
Bahrain 2020-05-15 6583 12.000000 1.01
Finland 2020-05-15 6228 293.000000 0.91
Afghanistan 2020-05-15 6053 153.000000 1.00
Moldova 2020-05-15 5745 202.000000 1.31
Kazakhstan 2020-05-15 5689 34.000000 0.77
Ghana 2020-05-15 5638 28.000000 0.89
Nigeria 2020-05-15 5450 171.000000 1.51
Oman 2020-05-15 4625 20.000000 0.88
Armenia 2020-05-15 4044 52.000000 1.30
Luxembourg 2020-05-15 3923 104.000000 0.73
Bolivia 2020-05-15 3577 164.000000 0.92
Hungary 2020-05-15 3417 442.000000 0.95
Iraq 2020-05-15 3193 117.000000 0.45
Cameroon 2020-05-15 3105 140.000000 0.98
Thailand 2020-05-15 3025 56.000000 7.00
Azerbaijan 2020-05-15 2980 36.000000 0.83
Greece 2020-05-15 2810 160.000000 4.00
Uzbekistan 2020-05-15 2686 11.000000 1.24
Guinea 2020-05-15 2473 15.000000 0.00
Honduras 2020-05-15 2460 134.000000 2.25
Senegal 2020-05-15 2310 25.000000 1.44
Bosnia and Herzegovina 2020-05-15 2236 128.000000 0.49
Croatia 2020-05-15 2222 95.000000 0.12
Bulgaria 2020-05-15 2138 102.000000 1.23
Cote d'Ivoire 2020-05-15 2017 24.000000 0.78
Sudan 2020-05-15 1964 91.000000 0.00
Cuba 2020-05-15 1840 79.000000 0.50
Iceland 2020-05-15 1802 10.000000 0.00
Estonia 2020-05-15 1766 63.000000 1.14
North Macedonia 2020-05-15 1740 97.000000 0.59
Guatemala 2020-05-15 1643 30.000000 0.71
Lithuania 2020-05-15 1523 54.000000 2.00
New Zealand 2020-05-15 1498 21.000000 0.00
Slovakia 2020-05-15 1480 27.000000 0.38
Slovenia 2020-05-15 1465 103.000000 1.00
Djibouti 2020-05-15 1309 4.000000 1.56
Congo (Kinshasa) 2020-05-15 1298 50.000000 0.77
Somalia 2020-05-15 1284 53.000000 0.00
El Salvador 2020-05-15 1210 25.000000 1.31
Gabon 2020-05-15 1209 10.000000 1.05
Kyrgyzstan 2020-05-15 1111 14.000000 0.76
Tunisia 2020-05-15 1035 45.000000 0.00
Maldives 2020-05-15 1031 4.000000 1.81
Latvia 2020-05-15 970 19.000000 0.73
Kosovo 2020-05-15 944 29.000000 0.00
Sri Lanka 2020-05-15 935 9.000000 1.00
Albania 2020-05-15 916 31.000000 1.00
Guinea-Bissau 2020-05-15 913 3.000000 0.00
Cyprus 2020-05-15 910 17.000000 1.50
Lebanon 2020-05-15 891 26.000000 0.62
Niger 2020-05-15 885 51.000000 0.56
Costa Rica 2020-05-15 843 9.000000 0.87
Mali 2020-05-15 806 46.000000 1.29
Kenya 2020-05-15 781 45.000000 1.10
Burkina Faso 2020-05-15 780 51.000000 0.00
Andorra 2020-05-15 761 49.000000 0.00
Paraguay 2020-05-15 759 11.000000 0.36
Uruguay 2020-05-15 732 19.000000 1.60
Diamond Princess 2020-05-15 712 13.000000 0.00
Georgia 2020-05-15 671 12.000000 0.20
Zambia 2020-05-15 654 7.000000 0.00
San Marino 2020-05-15 652 41.000000 0.80
Jordan 2020-05-15 596 9.000000 2.50
Equatorial Guinea 2020-05-15 594 7.000000 0.18
Malta 2020-05-15 532 6.000000 0.71
Jamaica 2020-05-15 511 9.000000 0.00
Tanzania 2020-05-15 509 21.000000 0.00
Venezuela 2020-05-15 459 10.000000 0.12
Sierra Leone 2020-05-15 447 27.000000 1.86
Taiwan* 2020-05-15 440 7.000000 0.00
Chad 2020-05-15 428 48.000000 1.07
Congo (Brazzaville) 2020-05-15 391 15.000000 0.00
West Bank and Gaza 2020-05-15 375 2.000000 0.00
Benin 2020-05-15 339 2.000000 0.00
Mauritius 2020-05-15 332 10.000000 0.00
Cabo Verde 2020-05-15 326 2.000000 0.42
Montenegro 2020-05-15 324 9.000000 0.00
Vietnam 2020-05-15 314 0.000000 0.08
Haiti 2020-05-15 310 20.000000 0.95
Central African Republic 2020-05-15 301 0.000000 0.00
Rwanda 2020-05-15 287 0.000000 0.00
Ethiopia 2020-05-15 287 5.000000 1.67
Nepal 2020-05-15 267 0.000000 -18.00
Togo 2020-05-15 263 11.000000 1.32
Madagascar 2020-05-15 238 0.000000 0.44
South Sudan 2020-05-15 236 4.000000 0.00
Sao Tome and Principe 2020-05-15 235 7.000000 0.00
Liberia 2020-05-15 219 20.000000 2.00
Uganda 2020-05-15 203 0.000000 2.05
Eswatini 2020-05-15 190 2.000000 0.00
Burma 2020-05-15 182 6.000000 0.00
Brunei 2020-05-15 141 1.000000 0.00
Cambodia 2020-05-15 122 0.000000 0.00
Mozambique 2020-05-15 119 0.000000 0.36
Trinidad and Tobago 2020-05-15 116 8.000000 0.00
Guyana 2020-05-15 116 10.000000 0.00
In [11]:
# Repeating colour palette, one entry per row of the top-30 table.
discrete_col = list(islice(cycle(['orange', 'r', 'g', 'k', 'b', 'c', 'm']), None, len(train_data_by_country_confirm.head(30))))
plt.rcParams.update({'font.size': 22})

# Plot only the two count columns named in the legend. Plotting the whole frame
# also draws every other numeric column (e.g. GrowthRate) as an unlabeled series.
top20_counts = train_data_by_country_confirm.head(20)[['ConfirmedCases', 'Fatalities']]
top20_counts.plot(figsize=(20, 15), kind='barh', color=discrete_col)
plt.legend(["Confirmed Cases", "Fatalities"]);
plt.xlabel("Number of Covid-19 Affectees")
plt.title("First 20 Countries with Highest Confirmed Cases")

# Write each value next to its bar; the vertical offsets put the confirmed
# count below the tick position and the fatality count above it.
ylocs, ylabs = plt.yticks()
for yloc, confirmed, fatal in zip(ylocs, top20_counts['ConfirmedCases'], top20_counts['Fatalities']):
    plt.text(confirmed + 0.01, yloc - 0.25, str(int(confirmed)), fontsize=12)
    if fatal > 0:  # no label for countries without reported fatalities
        plt.text(fatal + 0.01, yloc + 0.1, str(int(fatal)), fontsize=12)

Trend by Date (Time Series Diagnosis)

Below are the plots of confirmed cases and fatalities for nations with more than 600 fatalities. In the global plots, I also overlay the daily new confirmed cases and the daily deaths, respectively.

In [13]:
import matplotlib.dates as dates
def reformat_time(reformat, ax):
    """Apply the date-axis formatting and cosmetics used by the time-series plots.

    Parameters
    ----------
    reformat : bool or int
        When truthy, the x tick labels are additionally replaced by every 8th
        date of the global ``train_data_by_date`` frame, formatted as
        ``YYYY-MM-DD`` and rotated by 90 degrees.
    ax : matplotlib axes
        Axes to format in place.

    Notes
    -----
    Reads the module-level ``train_data_by_date`` only when ``reformat`` is truthy.
    """
    ax.xaxis.set_major_locator(dates.WeekdayLocator())
    ax.xaxis.set_major_formatter(dates.DateFormatter('%b %d'))
    if reformat:  # reformat again if you wish
        date_list = train_data_by_date.reset_index()["Date"].tolist()
        # Every 8th date. The values format themselves; the previous
        # `dt.datetime.strftime` relied on a `dt` alias that the visible
        # notebook never defines.
        x_ticks = [t.strftime('%Y-%m-%d') for t in date_list[::8]]
        ax.set_xticklabels(x_ticks, rotation=90)
    # cosmetics: dotted horizontal grid, no frame around the plot
    ax.yaxis.grid(linestyle='dotted')
    for side in ('right', 'top', 'left', 'bottom'):
        ax.spines[side].set_color('none')

train_data['Date'] = pd.to_datetime(train_data['Date'])

# ======= Global daily totals =======
# Counts are summed over all rows of a day; MortalityRate is averaged over them.
train_data_by_date = train_data.groupby(['Date'], as_index=True).agg({'ConfirmedCases': 'sum', 'Fatalities': 'sum',
                                                                     'NewConfirmedCases': 'sum', 'NewFatalities': 'sum',
                                                                     'MortalityRate': 'mean'})
# Floor negative values at zero. clip() returns the cleaned frame explicitly
# instead of relying on in-place edits through the private _get_numeric_data() view.
train_data_by_date = train_data_by_date.clip(lower=0.0)

## ======= Sort by countries with fatalities > 600 ========
# The threshold is applied to the country total: rows are first summed per
# country and day (several provinces can share a country), then the peak of
# those daily totals is taken. A plain per-row max would compare the largest
# single province instead of the country.
country_daily_totals = train_data.groupby(['Country_Region', 'Date'])[['ConfirmedCases', 'Fatalities']].sum()
train_data_by_country_max = country_daily_totals.groupby(level='Country_Region').max()
train_data_by_country_fatal = train_data_by_country_max[train_data_by_country_max['Fatalities'] > 600]
train_data_by_country_fatal = train_data_by_country_fatal.sort_values(by=['Fatalities'], ascending=False).reset_index()

# Keep only the rows of the selected countries, then aggregate per day and country.
df_merge_by_country = pd.merge(train_data, train_data_by_country_fatal['Country_Region'], on=['Country_Region'], how='inner')
df_max_fatality_country = df_merge_by_country.groupby(['Date', 'Country_Region'], as_index=False).agg({'ConfirmedCases': 'sum',
                                                                                                     'Fatalities': 'sum',
                                                                                                     'NewConfirmedCases': 'sum',
                                                                                                     'NewFatalities': 'sum',
                                                                                                     'MortalityRate': 'mean'})

# Floor negative values at zero in the numeric columns only (Date and
# Country_Region are left untouched), then index the frame by Date.
numeric_cols = df_max_fatality_country.select_dtypes(include='number').columns
df_max_fatality_country[numeric_cols] = df_max_fatality_country[numeric_cols].clip(lower=0.0)
df_max_fatality_country = df_max_fatality_country.set_index('Date')

# Selected countries, ordered by descending peak fatalities.
countries = train_data_by_country_fatal['Country_Region'].unique()

plt.rcParams.update({'font.size': 16})

# Two figures: confirmed cases (ax0 global, ax1 per country) and
# fatalities (ax2 global, ax3 per country).
fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(15, 8))
fig, (ax2, ax3) = plt.subplots(1, 2, figsize=(15, 8))

# x_compat=True tells pandas not to use its own datetime tick format.
# `legend` is only an on/off switch; the legend text is supplied through `label`.
train_data_by_date.ConfirmedCases.plot(ax=ax0, x_compat=True, title='Confirmed Cases Globally',
                                       label='Confirmed Cases', legend=True, color=discrete_col)
train_data_by_date.NewConfirmedCases.plot(ax=ax0, x_compat=True, linestyle='dotted',
                                          label='New Confirmed Cases', legend=True, color=discrete_col)
train_data_by_date.Fatalities.plot(ax=ax2, x_compat=True, title='Fatalities Globally',
                                   label='Fatalities', legend=True, color='r')
train_data_by_date.NewFatalities.plot(ax=ax2, x_compat=True, linestyle='dotted',
                                      label='Daily Deaths', legend=True, color='r')

# One curve per selected country, drawn in the order of `countries` so that the
# legend entries below line up with the curves.
for country in countries:
    df_fatality_by_country = df_max_fatality_country[df_max_fatality_country.Country_Region == country]
    df_fatality_by_country.ConfirmedCases.plot(ax=ax1, x_compat=True, title='Confirmed Cases Nationally')
    df_fatality_by_country.Fatalities.plot(ax=ax3, x_compat=True, title='Fatalities Nationally')

# Format every axis once, after all curves have been drawn.
for axis in (ax0, ax1, ax2, ax3):
    reformat_time(0, axis)

ax1.legend(countries, loc='center left', bbox_to_anchor=(1.0, 0.5))
ax3.legend(countries, loc='center left', bbox_to_anchor=(1.0, 0.5));
Out[13]:
<matplotlib.legend.Legend at 0x22baa8aa470>

Confirmed Cases: The number of new cases rises to a maximum in the week of Feb 11-18, after which China reached its saturation point. A new sudden rise then appears after March 24th, when the total number of new cases worldwide exceeds the total number of affectees in China alone.

Deaths: Since March 11th, the death toll has risen steeply due to the sharp increase in European countries, especially Italy, Spain, France and the UK, and now also in the US. The average mortality rate in these countries, shown below, can explain the peaks in the global mortality rate.

In [19]:
# plt.subplots creates the figure itself; a separate plt.figure() call only
# leaves an extra empty figure in the cell output.
fig, (ax4, ax5) = plt.subplots(1, 2, figsize=(20, 8))

# Global average mortality rate per day. x_compat=True tells pandas not to use
# its own datetime tick format; the legend text is supplied through `label`
# (`legend` is only an on/off switch).
train_data_by_date.MortalityRate.plot(ax=ax4, x_compat=True, label='Mortality Rate', legend=True, color='r')

# One curve per selected country, drawn in the order of `countries` so that the
# legend entries below line up with the curves.
for country in countries:
    df_fatality_by_country = df_max_fatality_country[df_max_fatality_country.Country_Region == country]
    df_fatality_by_country.MortalityRate.plot(ax=ax5, x_compat=True, title='Average Mortality Rate Nationally')

# Format both axes once, after all curves have been drawn.
for axis in (ax4, ax5):
    reformat_time(0, axis)

ax5.legend(countries, loc='center left', bbox_to_anchor=(1.0, 0.5));
Out[19]:
<matplotlib.legend.Legend at 0x22bab886588>
<Figure size 432x288 with 0 Axes>
In [ ]: